import sys
import csv
from unidecode import unidecode
import re
import unicodedata as ud

def remove_non_ascii(text):
    """Return an ASCII transliteration of *text* produced by ``unidecode``.

    *text* may be a UTF-8 encoded byte string (it is decoded first) or text
    that has already been decoded (it is passed to ``unidecode`` unchanged).

    Raises ``UnicodeDecodeError`` if a byte string is not valid UTF-8.
    """
    # ``bytes`` is an alias of ``str`` on Python 2, so this branch still covers
    # the byte strings existing callers pass in, without relying on the
    # Python-2-only ``unicode`` builtin (which also rejected decoded input).
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    return unidecode(text)

# Marker written in place of the two ratios when none can be computed.
_UNKNOWN_GENDER = "xxx&xxx"

# Compiled once instead of on every input line.
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')
_NON_WORD_RE = re.compile(r'[^\w]')


def _load_gender_table(path):
    """Read a name CSV into ``{name: (column 1, column 2)}`` with float values.

    Only the first usable row for a name is kept, matching a top-to-bottom
    scan that stops at the first hit. Rows with fewer than three columns
    (including blank lines) or non-numeric values (e.g. a header) are skipped.
    """
    table = {}
    with open(path, 'rt') as handle:
        for row in csv.reader(handle, delimiter=','):
            if len(row) < 3:
                continue
            try:
                counts = (float(row[1]), float(row[2]))
            except ValueError:
                continue
            table.setdefault(row[0], counts)
    return table


def _first_name_key(raw_name):
    """Return the upper-cased first-name token of a display name.

    The name is transliterated to ASCII, every non-word character becomes a
    separator, and the first token is used -- unless there are more than two
    tokens and the first is a single character, in which case the second token
    is used. Returns '' when the name contains no word characters.
    """
    ascii_name = _NON_ASCII_RE.sub('', remove_non_ascii(raw_name))
    # split() with no argument drops the empty tokens left behind by runs of
    # separators; split(' ') turned "J. Robert Smith" into ['J', '', 'Robert',
    # 'Smith'] and therefore selected '' as the first name.
    tokens = _NON_WORD_RE.sub(' ', ascii_name).split()
    if not tokens:
        return ''
    # Dots have already been replaced by separators above, so a leading
    # initial can only be recognised by its length.
    if len(tokens) > 2 and len(tokens[0]) < 2:
        return tokens[1].upper()
    return tokens[0].upper()


def _format_gender(col1, col2):
    """Format ``col2 share & col1 share``, or the unknown marker if the total is not positive."""
    total = col2 + col1
    if total > 0.0:
        return str(col2 / total) + '&' + str(col1 / total)
    return _UNKNOWN_GENDER


def main():
    """Append a gender-ratio column to every row of the Twitter user file.

    Reads ``./firstname_db/firstname.csv`` and
    ``./firstname_db/firstname_nickname.csv`` (name in column 0, numeric values
    in columns 1 and 2 -- presumably male and female counts; confirm against
    the data files). For each tab-separated line of ``Twitter_user_info_file``
    the third field is taken as the user's display name, its first name is
    looked up in the first table and, if absent there, in the nickname table.

    Each input line is written to ``Twitter_users_gender_prob.csv`` followed by
    a tab and ``<col2 share>&<col1 share>``, or ``xxx&xxx`` when the name is
    unknown, the line has fewer than three fields, or the values sum to zero.
    """
    name_table = _load_gender_table("./firstname_db/firstname.csv")
    nickname_table = _load_gender_table("./firstname_db/firstname_nickname.csv")

    # Context managers guarantee both files are flushed and closed, even if a
    # row raises part-way through.
    with open("Twitter_user_info_file", "rb") as users:
        with open("Twitter_users_gender_prob.csv", "wb") as out:
            for line in users:
                fields = line.strip().split(b'\t')
                # A short row has no name field; None never matches a table key.
                key = _first_name_key(fields[2]) if len(fields) > 2 else None

                counts = name_table.get(key)
                if counts is None:
                    counts = nickname_table.get(key)

                if counts is None:
                    verdict = _UNKNOWN_GENDER
                else:
                    verdict = _format_gender(*counts)
                out.write(line.rstrip() + b"\t" + verdict.encode("ascii") + b"\n")

    print('program finished')

# Script entry point: announce start-up, then run the annotation job.
if __name__ == '__main__':
    # Parenthesised single-argument print emits the same text on Python 2
    # and is also valid syntax on Python 3.
    print("program running")
    main()
